drive <- "C:" code.dir <- paste(drive, "DM_Code", "Section_10", sep="/") data.dir <- paste(drive, "DM_Data", "Section_10", sep="/") # source(paste(code.dir, "ReadFleas.r", sep = "/")) # We need this for "class.ind" function library(nnet) library(tree) make.tree <- function(formula, data, min.per.leaf, what.measure = "deviance") { # formula: the formula that was used in the call # e.g make.tree(formula = flea.species ~ ., # data = sdf.flea, what.measure = "deviance") # data: the data frame # what.measure: deviance, gini, ggini # min.per.leaf: minimum size for a leaf to prevent overfitting m.call <- match.call(expand = FALSE) # m$what.measure is "deviance", "gini", "ggini" so we remove it m.call$what.measure <- NULL # as well as min.per.leaf m.call$min.per.leaf <- NULL # We need to change the name so that we do not keep calling this function m.call[[1]] <- as.name("model.frame.default") m <- eval.parent(m.call) class <- model.response(m) pred <- m[, -1] # Get measures at the root info to start the process measure <- get.measure (pred[, 1], class, min(pred[,1]) - 1, what.measure) # We start to assemble the tree list of which the which the # first element is a data frame with 6 columns # - vars, n, dev, yval, splits, yprob # where splits is a matrix with 2 cols. and yprob has # of classes cols. split.info <- data.frame(matrix(0, ncol(pred), 1)) n.class <- length(levels(class)) dimnames(split.info) <- list(colnames(pred), "split") rownames(split.info)[1] <- "" split.info$measure <- array(0, c(ncol(pred), 2)) split.info$measure[1, 2] <- measure$right.measure colnames(split.info$measure) <- c("left", "right") split.info$left.prob <- array(0, c(ncol(pred), n.class)) colnames(split.info$left.prob) = levels(class) pos <- 4 + n.class split.info$right.prob <- array(0, c(ncol(pred), n.class)) colnames(split.info$right.prob) = levels(class) pos <- pos + n.class split.info$left.count <- array(0, c(ncol(pred), n.class)) colnames(split.info$left.count) = levels(class) pos <- pos + n.class split.info$right.count <- array(0, c(ncol(pred), n.class)) colnames(split.info$right.count) = levels(class) yprob <- measure$right.yprob dev <- measure$right.measure split.info$right.count[1, ] <- apply(class.ind(class), 2, sum) split.info$right.prob[1, ] <- yprob # Find the dominant class at the root yval <- levels(class)[which(max(yprob) == yprob)] top.class <- levels(class)[which(max(yprob) == yprob)][1] cat("1) root", length(class), measure[[2]],top.class,"(",yprob,")", "\n") # Start it info <- do.tree(pred, class, 1, {}, split.info, 1, measure$right.measure, what.measure, min.per.leaf) # We complete the tree structure node.numb <- {} for (i in 1:length(info)) { node.numb <- c(node.numb, info[[i]]$node) } (node.index <- 1:length(node.numb)) tree.sz <- length(node.index) var <- matrix("", tree.sz, 1) rownames(var) <- 1:tree.sz n <- matrix(0, tree.sz, 1) dev <- matrix(0, tree.sz, 1) yval <- matrix("", tree.sz, 1) cuts <- matrix("", tree.sz, 2) colnames(cuts) <- c("cutleft", "cutright") probs <- matrix(0, sum(!is.na(node.index)), length(levels(class))) colnames(probs) <- sort(levels(class)) where <- {} # We will mark the values used with NA. 
do.tree <- function(pred, class, node, info, new.info, space, root.measure,
                    what.measure, min.per.leaf, measure.drop) {
  # pred: the predictors
  # class: the known classes
  # info: the information about the tree
  # new.info: the information about the current node
  # space: used for printing the tree on the fly - basically the depth
  # root.measure: the "deviance" at the start
  # what.measure: deviance, gini, ggini
  # min.per.leaf: minimum size for a leaf to prevent overfitting
  # measure.drop: not yet used - would require the measure to change significantly
  old.measure <- sum(new.info$measure[1, ])
  where <- NULL
  leaf <- FALSE
  # We try to avoid overfitting by requiring that there be 2 x the minimum.
  # It is probable that this will still lead to leaves that are too small
  if (dim(pred)[1] <= 2 * min.per.leaf) {
    leaf <- TRUE
    where <- as.numeric(rownames(pred))
  }
  # No mixing
  if (length(unique(class)) == 1) {
    leaf <- TRUE
    where <- as.numeric(rownames(pred))
  }
  # We use find.split to do the splitting. It returns 'split.info', which has,
  # for each variable, the split point, the left/right measures, the left/right
  # probabilities for each class and the left/right counts for each class.
  split.info <- find.split(pred, class, what.measure, min.per.leaf)
  # Get the total measures
  total <- apply(split.info$measure, 1, sum)
  # No split in min.per.leaf to (len - min.per.leaf)
  if (min(total) == 2 * 10^30) {
    leaf <- TRUE
    where <- as.numeric(rownames(pred))
  }
  info <- c(info, list(list(node = node, where = where, split.info = new.info)))
  if (!leaf) {
    split.info.ind <- which(min(total) == total)[1]
    # We take the first one in case of duplicates.
    # split.info.ind gives the row with the min. measure (var & split).
    # Get the directions for this split 'split.info[split.info.ind, 1]'
    # and the measures (similar to what was done in make.tree).
    # Now send all the data var < split to the left.
    # We make sure that we are decreasing
    # if ((measure$left.measure + measure$right.measure)
    #      < old.measure - measure.drop*root.measure)
    split.min <- split.info[split.info.ind, ]
    left.index <- (pred[, split.info.ind] < split.info[split.info.ind, 1])
    split.left <- pred[left.index, ]
    yprob <- split.min$left.prob
    top.class <- levels(class)[which(max(yprob) == yprob)][1]
    cat(rep(" ", space), paste(2 * node, ")", sep = ""),
        dimnames(pred)[[2]][split.info.ind], "<", unlist(split.min[1]),
        sum(split.min$left.count), split.min$measure[1],
        top.class, "(", yprob, ")", "\n")
    # Recursion on the current node
    info <- do.tree(split.left, class[left.index], 2 * node, info,
                    split.info[order(total), ], space + 1, root.measure,
                    what.measure, min.per.leaf, measure.drop)
    # If we are here we have gone as far as we can to the left.
    # Now send all the data var > split to the right
    right.index <- (pred[, split.info.ind] > split.info[split.info.ind, 1])
    split.right <- pred[right.index, ]
    yprob <- split.min$right.prob
    top.class <- levels(class)[which(max(yprob) == yprob)][1]
    cat(rep(" ", space), paste(2 * node + 1, ")", sep = ""),
        dimnames(pred)[[2]][split.info.ind], ">", unlist(split.min[1]),
        sum(split.min$right.count), split.min$measure[2],
        top.class, "(", yprob, ")", "\n")
    # Recursion
    info <- do.tree(split.right, class[right.index], 2 * node + 1, info,
                    split.info[order(total), ], space + 1, root.measure,
                    what.measure, min.per.leaf, measure.drop)
  }
  info
}
find.split <- function(pred, class, what.measure, min.per.leaf) {
  # pred: the predictors
  # class: the known classes
  # min.per.leaf: minimum size for a leaf to prevent overfitting
  # Get the number of predictors and cases.
  numb.pred <- dim(pred)[2]
  numb.cases <- length(class)
  #
  split.info <- {}
  # We try to avoid overfitting by requiring that there be 2 x the minimum.
  # It is probable that this will still lead to leaves that are too small
  used <- {}
  for (var.index in 1:numb.pred) {        # Run through the variables
    var.sorted <- sort(pred[, var.index]) # Sort
    # If we require a minimum in each leaf we can set the search to start at
    # the min (say 5) and stop at (length - min)
    var.diffs <- diff(var.sorted)
    split.index <- which(var.diffs != 0)
    split.index <- split.index[split.index > min.per.leaf - 1]
    split.index <- split.index[split.index < length(var.sorted) - min.per.leaf + 1]
    split.lo <- split.index
    split.hi <- split.index + 1
    split <- (var.sorted[split.lo] + var.sorted[split.hi]) / 2
    # It is possible that there is no valid split (i.e. no change in the
    # variable from min.per.leaf to length - min.per.leaf).
    # Loop through the variable
    measure.info <- {}
    if (length(split) > 0) {
      for (split.pt in split) {
        # Get the left/right for all classes and get the measures
        measure.info <- rbind(measure.info,
                              c(split.pt,
                                unlist(get.measure(pred[, var.index], class,
                                                   split.pt, what.measure))))
      }
      min.ind <- which(measure.info[, 2] + measure.info[, 3] ==
                         min(measure.info[, 2] + measure.info[, 3]))[1]
      split.info <- rbind(split.info, measure.info[min.ind, ])
    } else {
      split.info <- rbind(split.info,
                          matrix(10^30, 1, ncol = 1 + 2 + 4 * length(levels(class))))
    }
  }
  rownames(split.info) <- colnames(pred)
  split.info.min <- which(split.info[, 2] + split.info[, 3] ==
                            min(split.info[, 2] + split.info[, 3]))[1]
  info.df <- data.frame(split.info[, 1])
  n.class <- length(levels(class))
  colnames(info.df) <- c("split")
  info.df$measure <- array(split.info[, 2:3], c(ncol(pred), 2))
  colnames(info.df$measure) <- c("left", "right")
  info.df$left.prob <- array(split.info[, 4:(3 + n.class)], c(ncol(pred), n.class))
  colnames(info.df$left.prob) <- levels(class)
  pos <- 4 + n.class
  info.df$right.prob <- array(split.info[, pos:(pos - 1 + n.class)],
                              c(ncol(pred), n.class))
  colnames(info.df$right.prob) <- levels(class)
  pos <- pos + n.class
  info.df$left.count <- array(split.info[, pos:(pos - 1 + n.class)],
                              c(ncol(pred), n.class))
  colnames(info.df$left.count) <- levels(class)
  pos <- pos + n.class
  info.df$right.count <- array(split.info[, pos:(pos - 1 + n.class)],
                               c(ncol(pred), n.class))
  colnames(info.df$right.count) <- levels(class)
  info.df
}
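#
# A small, made-up illustration (added) of how find.split forms its candidate
# cut points: midpoints between adjacent distinct values of the sorted
# predictor (shown here before the min.per.leaf trimming).
toy.var <- c(1, 1, 2, 4, 7)
toy.sorted <- sort(toy.var)
toy.ind <- which(diff(toy.sorted) != 0)
(toy.sorted[toy.ind] + toy.sorted[toy.ind + 1]) / 2   # 1.5 3.0 5.5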
get.measure <- function(pred, class, split, what.measure) {
  # pred: the predictor (a single variable)
  # class: the known classes
  # split: the split point
  # what.measure: deviance, gini, ggini
  #
  # Find the possible classes
  the.classes <- levels(class)
  # Find the number of classes
  numb.classes <- length(the.classes)
  # If we split the current variable, what classes go left/right?
  to.left <- rep(0, numb.classes)
  to.right <- rep(0, numb.classes)
  left.measure <- 0
  right.measure <- 0
  # For each class, count the number that go left/right
  # based on the current splitting
  for (i in 1:numb.classes) {
    to.left[i] <- sum(class[pred < split] == the.classes[i])
    to.right[i] <- sum(class[pred >= split] == the.classes[i])
  }
  sum.left <- sum(to.left)
  sum.right <- sum(to.right)
  #
  if ((what.measure == "gini") || (what.measure == "ggini")) {
    if (sum.left != 0) {
      left.measure <- 1 - sum(to.left * to.left) / (sum.left * sum.left)
    }
    if (sum.right != 0) {
      right.measure <- 1 - sum(to.right * to.right) / (sum.right * sum.right)
    }
    if (what.measure == "ggini") {
      left.measure <- left.measure * sum.left
      right.measure <- right.measure * sum.right
    }
  } else {
    # Compute the deviance for the split based on the number of each class
    # going left or right.
    if (sum.left != 0) { left.measure <- sum.left * log(sum.left) }
    if (sum.right != 0) { right.measure <- sum.right * log(sum.right) }
    for (i in 1:numb.classes) {
      if (to.left[i] > 0) {
        left.measure <- left.measure - to.left[i] * log(to.left[i])
      }
      if (to.right[i] > 0) {
        right.measure <- right.measure - to.right[i] * log(to.right[i])
      }
    }
    left.measure <- 2 * sum(left.measure)
    right.measure <- 2 * sum(right.measure)
  }
  list(left.measure = left.measure, right.measure = right.measure,
       left.yprob = to.left / sum.left, right.yprob = to.right / sum.right,
       left.count = to.left, right.count = to.right)
}
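#
# Sanity check (added, not in the original script): for what.measure =
# "deviance", get.measure returns, for each side of the split,
#   2 * [ n*log(n) - sum_k n_k*log(n_k) ]  =  -2 * sum_k n_k * log(n_k / n),
# the usual node deviance. Verify on toy data (toy.x and toy.cl are made up
# purely for this check).
toy.x <- c(1, 2, 3, 4, 5, 6)
toy.cl <- factor(c("a", "a", "b", "a", "b", "b"))
gm <- get.measure(toy.x, toy.cl, split = 3.5, what.measure = "deviance")
# Left of 3.5 the counts are (a = 2, b = 1), so by hand:
c(gm$left.measure, -2 * (2 * log(2/3) + 1 * log(1/3)))   # both about 3.819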
min.per.leaf <- 5      # Don't overfit
measure.drop <- 0.01   # We split as long as there is improvement
(lev <- unique(flea.species))
len.lev <- length(lev)   # Number of classes
sdf.flea <- data.frame(df.flea, flea.species)

# Deviance
info.dev <- make.tree(flea.species ~ ., sdf.flea, min.per.leaf,
                      what.measure = "deviance")
plot(df.flea[, 1], df.flea[, 6], col = species + 1,
     xlab = "tars1", ylab = "aede3", main = "Deviance", pch = 19)
lines(c(118, 245), c(93.5, 93.5))
lines(c(159, 159), c(93.5, 125))

# Gini
info.g <- make.tree(flea.species ~ ., sdf.flea, min.per.leaf,
                    what.measure = "gini")
plot(df.flea[, 1], df.flea[, 4], col = species + 1,
     xlab = "tars1", ylab = "aede1", main = "Gini", pch = 19)
lines(c(159, 159), c(113, 160))
lines(c(159, 243), c(133.5, 133.5))

# Generalized Gini
info.gg <- make.tree(flea.species ~ ., sdf.flea, min.per.leaf,
                     what.measure = "ggini")

# Ripley
library(tree)
(flea.tr <- tree(flea.species ~ ., sdf.flea))
#
summary(flea.tr)
summary(info.dev)
summary(info.g)
summary(info.gg)
#
plot(flea.tr)
text(flea.tr, all = T)
# Uniform spacing
plot(flea.tr, type = "uniform")
text(flea.tr, all = T)
#
for (i in 1:length(flea.tr)) { print(flea.tr[i]) }
flea.tr$frame[, 6]
#
source(paste(code.dir, "confusion_ex.r", sep = "/"))
confusion.expand(predict(flea.tr, df.flea, type = "class"), factor(flea.species))
# With more info
source('C:/DM_Code/Section_10/EJNplottree.r')
source('C:/DM_Code/Section_10/EJNtexttree.r')
plot.tree.EJN(flea.tr, type = "uniform")
text.tree.EJN(flea.tr, all = T)
# Plot the separations
plot(d.flea[, 6], d.flea[, 1], xlab = "AEDE3", ylab = "TARS1", type = "n")
text(d.flea[, 6], d.flea[, 1], c(rep("C", 21), rep("Hp", 22), rep("Hk", 31)))
partition.tree(flea.tr, add = T)   # add = T draws the partition on the existing plot
abline(v = 93.5)
lines(c(93.5, 125), c(159, 159))
lines(c(93.5, 125), c(190.5, 190.5))
#
t(predict(flea.tr, df.flea))
#
predict(flea.tr, df.flea, type = "class")
#
require(tree)
sdf.flea <- data.frame(df.flea, flea.species)
(flea.trg <- tree(flea.species ~ ., sdf.flea, split = "gini"))
plot.tree.EJN(flea.trg, type = "uniform")
text.tree.EJN(flea.trg, all = T)
#
detach("package:tree")
require(tree)
sdf.flea <- data.frame(df.flea, flea.species)
(flea.trng <- tree(flea.species ~ ., sdf.flea, split = "gini"))
plot.tree.EJN(flea.trng, type = "uniform")
text.tree.EJN(flea.trng, all = T, cex = 1)

# RPART
sdf.flea <- data.frame(df.flea, flea.species)
library(rpart)
(flea.rp <- rpart(flea.species ~ ., sdf.flea, minbucket = 5))
#
summary(flea.rp)
#
confusion.expand(predict(flea.rp, df.flea, type = "class"), factor(flea.species))
#
plot(flea.rp)
text(flea.rp, all = T)
#
plot(flea.rp, margin = 0.03)
text(flea.rp, all = T, use.n = T)
#
for (i in 1:length(flea.rp)) { print(flea.rp[i]) }
t(predict(flea.rp, sdf.flea))
predict(flea.rp, sdf.flea, type = "class")
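#
# Quick comparison (added): the hand-rolled deviance tree was built to mimic
# the structure of a "tree" object, so its frame can be printed next to the
# frame from tree() for a rough side-by-side check.
info.dev$frame[, c("var", "n", "dev", "yval")]
flea.tr$frame[, c("var", "n", "dev", "yval")]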
# Surrogate
flea.tr
flea.rp
flea.miss <- sdf.flea
flea.miss[1:74, 6] <- NA
predict(flea.tr, flea.miss, type = "class")
as.integer(predict(flea.tr, sdf.flea, type = "class")) -
  as.integer(predict(flea.tr, flea.miss, type = "class"))
predict(flea.rp, flea.miss, type = "class")
as.integer(predict(flea.rp, sdf.flea, type = "class")) -
  as.integer(predict(flea.rp, flea.miss, type = "class"))

# Look at the synthetic classes
require(tree)
# load(paste(data.dir, "syn.Rdata", sep = "/"))
load(paste(data.dir, "synTrainTest.Rdata", sep = "/"))
#
syn.train.class <- res$tp[tt.ind[[1]], ]
syn.train.data <- res$data[tt.ind[[1]], ]
#
syn.test.class <- res$tp[tt.ind[[2]], ]
syn.test.data <- res$data[tt.ind[[2]], ]
# 1
df.data <- data.frame(resp = syn.train.class[, 1],
                      Data = syn.train.data[, 1], Mining = syn.train.data[, 2])
(syn.tr <- tree(as.factor(resp) ~ ., data = df.data))
plot(df.data$Data, df.data$Mining, type = "n", xlab = "Data", ylab = "Mining")
text(df.data$Data, df.data$Mining, as.character(df.data$resp),
     col = 2 + df.data$resp)
partition.tree(syn.tr, add = TRUE)
# 8
df.data <- data.frame(resp = syn.train.class[, 8],
                      Data = syn.train.data[, 1], Mining = syn.train.data[, 2])
(syn.tr <- tree(as.factor(resp) ~ ., data = df.data))
plot(df.data$Data, df.data$Mining, type = "n", xlab = "Data", ylab = "Mining")
text(df.data$Data, df.data$Mining, as.character(df.data$resp),
     col = 2 + df.data$resp)
partition.tree(syn.tr, add = TRUE, cex = 1.5)

# Better version
source('C:/DM_Code/Section_10/EJNPartitionTree.r')

CNRT.ex <- function(df.data, test.data, test.class, kind) {
  graphics.off()
  x11(width = 8, height = 4)
  oldpar <- par(mfrow = c(1, 2), mar = c(4, 4, 3, 2))
  data.tr <- tree(as.factor(resp) ~ ., data = df.data)
  print(data.tr)
  print(summary(data.tr))
  plot.tree.EJN(data.tr, type = "uniform")
  text.tree.EJN(data.tr, all = T, dig = 3, count = F)
  plot.partition(data.tr, df.data[, 1], df.data[, 2], df.data[, 3],
                 main = paste("Tree", kind))
  print("Training confusion matrix")
  print(confusion.expand(predict(data.tr, df.data, type = "class"),
                         factor(df.data[, 1])))
  points(test.data[, 1], test.data[, 2], pch = test.class + 3, col = test.class + 2)
  print("Test confusion matrix")
  print(confusion.expand(predict(data.tr, test.data, type = "class"),
                         factor(test.class)))
  par(oldpar)
  x11(width = 8, height = 4)
  oldpar <- par(mfrow = c(1, 2), mar = c(4, 4, 3, 2))
  data.rp <- rpart(as.factor(resp) ~ ., data = df.data, minbucket = 5)
  print(summary(data.rp))
  plot(data.rp, uniform = T, margin = 0.05)
  text(data.rp, all = T)
  plot.partition(data.rp, df.data[, 1], df.data[, 2], df.data[, 3],
                 main = paste("Rpart", kind))
  print("Training confusion matrix")
  print(confusion.expand(predict(data.rp, df.data, type = "class"),
                         factor(df.data[, 1])))
  points(test.data[, 1], test.data[, 2], pch = test.class + 3, col = test.class + 2)
  print("Test confusion matrix")
  print(confusion.expand(predict(data.rp, test.data, type = "class"),
                         factor(test.class)))
  par(oldpar)
}

f.menu <- function() {
  while (TRUE) {
    cat("Enter the number of the data set", "\n")
    resN <- menu(c("1:2", "1/3:2/4", "1:2/3:4", "1:2 rotated",
                   "1/3:2/4 rotated", "1:2/3:4 rotated", "Hole", "y^2"))
    if (resN == 0) break
    df.data <- data.frame(resp = syn.train.class[, resN],
                          X = syn.train.data[, 1], Y = syn.train.data[, 2])
    CNRT.ex(df.data, data.frame(syn.test.data), syn.test.class[, resN], resN)
    bringToTop(which = -1)
  }
}

# Run the synthetics
f.menu()
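#
# Non-interactive alternative (added sketch): f.menu() needs a live session;
# when sourcing this file in batch mode the same analysis can be run for, say,
# data set 1 by mirroring the body of f.menu directly.
df.data <- data.frame(resp = syn.train.class[, 1],
                      X = syn.train.data[, 1], Y = syn.train.data[, 2])
CNRT.ex(df.data, data.frame(syn.test.data), syn.test.class[, 1], 1)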
######################################################
# Forensic glass
######################################################
library(MASS)
library(tree)
data(fgl)
#
(fgl.tr <- tree(type ~ ., fgl))
#
summary(fgl.tr)
#
fgl.tr$frame
fgl.tr$frame$n
fgl.tr$frame$yprob * fgl.tr$frame$n

# Display
plot(fgl.tr, cex = 0.7)
text(fgl.tr, all = T, cex = 0.7)
# Uniform spacing
plot(fgl.tr, type = "uniform")
text(fgl.tr, all = T, cex = 0.7)
# With more info
source(paste(code.dir, "EJNPlotTree.R", sep = "/"))
source(paste(code.dir, "EJNTextTree.R", sep = "/"))
plot.tree.EJN(fgl.tr, type = "uniform")
text.tree.EJN(fgl.tr, all = T, cex = 0.75)
#
source(paste(code.dir, "plotsubtree.R", sep = "/"))
plot.subtree(fgl.tr, 2, "uniform")
#
confusion.expand(predict(fgl.tr, fgl, type = "class"), fgl[, 10])

# 3-D version
attach(fgl)
(fgl.tr.3 <- tree(type ~ Mg + Al + Na))
plot(fgl.tr.3, type = "uniform")
text(fgl.tr.3, all = T)
plot.partition.3D(fgl.tr.3, type, Mg, Al, Na, "Mg", "Al", "Na", main = "")
# Spin the 3-D view (rgl.viewpoint is from the rgl package)
start <- proc.time()[3]
while ((i <- 36 * (proc.time()[3] - start)) < 360) {
  cat(i, -90 + i/2, "\n")
  rgl.viewpoint(i, -90 + i/2)
}
detach(fgl)

# rpart on glass
library(rpart)
(fgl.rp <- rpart(type ~ ., fgl, minbucket = 5))
confusion.expand(predict(fgl.rp, fgl, type = "class"), fgl[, 10])

# Drop data through the tree and see how it is classified.
# Here all the data is used to get misclassification info.
# Gives predicted class probs.
(fgl.tr.pred.v <- predict(fgl.tr, fgl[, 1:9], type = "vector"))
# Gives predicted classes
(fgl.tr.pred.c <- predict(fgl.tr, fgl[, 1:9], type = "class"))
# To which node
(fgl.tr.pred.w <- predict(fgl.tr, fgl[, 1:9], type = "where"))
# Drop selected cases
predict(fgl.tr, fgl[c(109, 130, 178, 184, 189), 1:9], type = "class")
fgl[c(109, 130, 178, 184, 189), 10]
fgl.tr.pred.v[c(109, 130, 178, 184, 189), ]
#
plot(fgl.tr, type = "uniform")
text(fgl.tr, all = T, cex = 0.75)
snip.tree(fgl.tr)   # interactive: click on nodes to snip
#
do.snip <- function(tr, node) {
  snip.xx <- snip.tree(tr, node)
  print(snip.xx)
  plot.tree.EJN(snip.xx, type = "uniform", scale = 0.85)
  text(snip.xx, all = T, cex = 0.75)
  snip.xx
}
oldpar <- par(mfrow = c(2, 2))
par(mar = c(1, 2, 1, 2))
sn.tr <- do.snip(fgl.tr, 26)
text(9, -4.7, "1", col = "red", cex = 1.5)
sn.tr <- do.snip(sn.tr, 108)
text(9, -4.7, "1", col = "red", cex = 1.5)
text(10, -6.7, "2", col = "red", cex = 1.5)
sn.tr <- do.snip(sn.tr, 109)
text(9, -4.7, "1", col = "red", cex = 1.5)
text(10, -6.61, "2", col = "red", cex = 1.5)
text(11, -6.61, "3", col = "red", cex = 1.5)
sn.tr <- do.snip(sn.tr, 31)
text(9, -4.7, "1", col = "red", cex = 1.5)
text(10, -6.61, "2", col = "red", cex = 1.5)
text(11, -6.61, "3", col = "red", cex = 1.5)
text(16, -4.47, "4", col = "red", cex = 1.5)
par(oldpar)
#
cv.tr <- cv.tree(fgl.tr, , prune.tree, method = "misclass")
plot(cv.tr)
#
x11()
cv.tr <- cv.tree(fgl.tr, , prune.tree, method = "misclass")
plot(cv.tr)
#
# The following runs the cross-validation 5 times and takes the average.
# Because the cv uses random samples the results are not always the same
fgl.cv <- cv.tree(fgl.tr, , prune.misclass)
for (i in 2:5) {
  fgl.cv$dev <- fgl.cv$dev + cv.tree(fgl.tr, , prune.misclass)$dev
}
fgl.cv$dev <- fgl.cv$dev / 5
plot(fgl.cv)
# We can look at the size vs. misclassification
cbind(fgl.cv$size, fgl.cv$dev)
# Select a size that gives a good result - 9?
# "cv.tree" will return a tree of that size with the least misclassification.
(pruned.tree <- cv.tree(fgl.tr, , best = 9, prune.misclass))
plot.tree.EJN(pruned.tree, type = "u")
text.tree.EJN(pruned.tree, all = T)
# Send the data down the reduced tree
pred <- predict(pruned.tree, fgl[, 1:9], type = "class")
which(pred != fgl[, 10])
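#
# (Added) The size used above (9) was read off the cbind() output; it can
# also be picked programmatically from the averaged cross-validation results,
# with ties going to the smaller tree.
(best.size <- min(fgl.cv$size[fgl.cv$dev == min(fgl.cv$dev)]))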
confusion.expand(predict(fgl.tr, fgl[, 1:9], type = "class"), fgl[, 10])
#
confusion.expand(predict(pruned.tree, fgl[, 1:9], type = "class"), fgl[, 10])

# Train/Test
# ======================================================================
# A function to produce those indices NOT in a set.
# Use this to get the test sample.
# ======================================================================
"%w/o%" <- function(x, y) x[!x %in% y]
# ======================================================================
# Set the indices for the training/test sets
# ======================================================================
get.train <- function(data.sz, train.sz) {
  # Take subsets of data for training/test samples and return the indices
  train.ind <- unique(floor(runif(10 * train.sz) * data.sz) + 1)[1:train.sz]
  test.ind <- (1:data.sz) %w/o% train.ind
  list(train = train.ind, test = test.ind)
}
# Get a training and a test sample
tt.1 <- get.train(214, 140)
# For reproducibility, use a fixed training/test split
tt.1$train <- sort(c(108, 203, 210, 142, 16, 91, 144, 191, 164, 120, 200, 74,
                     62, 155, 179, 133, 76, 117, 177, 73, 137, 10, 22, 176,
                     178, 172, 135, 28, 81, 118, 54, 98, 99, 180, 102, 170,
                     119, 33, 182, 212, 21, 4, 100, 60, 55, 121, 149, 42, 160,
                     15, 101, 198, 123, 131, 18, 156, 47, 140, 94, 13, 9, 150,
                     24, 168, 189, 214, 64, 92, 1, 19, 122, 162, 110, 187, 125,
                     195, 66, 158, 143, 141, 132, 175, 17, 45, 113, 61, 67, 86,
                     190, 41, 163, 96, 207, 34, 36, 138, 89, 32, 95, 116, 109,
                     14, 193, 48, 65, 157, 209, 111, 115, 29, 51, 196, 205,
                     103, 11, 184, 68, 31, 82, 136, 56, 93, 50, 59, 169, 206,
                     128, 38, 148, 20, 27, 83, 145, 44, 202, 183, 46, 12, 37,
                     201))
#
tt.1$test <- sort(c(2, 3, 5, 6, 7, 8, 23, 25, 26, 30, 35, 39, 40, 43, 49, 52,
                    53, 57, 58, 63, 69, 70, 71, 72, 75, 77, 78, 79, 80, 84, 85,
                    87, 88, 90, 97, 104, 105, 106, 107, 112, 114, 124, 126,
                    127, 129, 130, 134, 139, 146, 147, 151, 152, 153, 154, 159,
                    161, 165, 166, 167, 171, 173, 174, 181, 185, 186, 188, 192,
                    194, 197, 199, 204, 208, 211, 213))
# Show the frequencies
oldpar <- par(mfrow = c(2, 1), mar = c(4, 4, 2, 1))
hist(match(fgl[, 10], levels(fgl[, 10])), breaks = c(0, 1, 2, 3, 4, 5, 6),
     main = "fgl", xlab = "classes")
hist(match(fgl[tt.1$train, 10], levels(fgl[tt.1$train, 10])),
     breaks = c(0, 1, 2, 3, 4, 5, 6), main = "fgl (Training)", xlab = "classes")
par(oldpar)
fgl.tr.1 <- tree(type ~ ., fgl[tt.1$train, ])
summary(fgl.tr.1)
confusion.expand(predict(fgl.tr.1, fgl[tt.1$train, ], type = "class"),
                 fgl[tt.1$train, 10])
#
confusion.expand(predict(fgl.tr.1, fgl[tt.1$test, ], type = "class"),
                 fgl[tt.1$test, 10])

# Repeat
tt.2 <- get.train(214, 140)
tt.2$train <- sort(c(126, 130, 107, 38, 10, 186, 78, 87, 201, 165, 118, 184,
                     46, 86, 40, 97, 202, 154, 11, 21, 39, 22, 82, 49, 56, 4,
                     169, 53, 182, 153, 88, 139, 167, 8, 209, 138, 90, 122, 47,
                     150, 94, 102, 69, 34, 152, 33, 211, 35, 206, 92, 43, 117,
                     30, 144, 99, 55, 191, 141, 132, 16, 129, 127, 188, 142,
                     190, 105, 101, 24, 93, 168, 113, 112, 197, 27, 159, 17,
                     179, 192, 157, 163, 15, 210, 85, 175, 171, 89, 61, 198,
                     203, 178, 187, 74, 208, 48, 12, 76, 123, 120, 204, 7, 36,
                     214, 100, 13, 140, 66, 98, 5, 41, 106, 146, 119, 6, 29,
                     63, 31, 52, 28, 177, 32, 155, 156, 116, 42, 64, 196, 162,
                     84, 199, 54, 158, 73, 109, 9, 143, 194, 148, 51, 26, 96))
#
tt.2$test <- sort(c(1, 2, 3, 14, 18, 19, 20, 23, 25, 37, 44, 45, 50, 57, 58,
                    59, 60, 62, 65, 67, 68, 70, 71, 72, 75, 77, 79, 80, 81, 83,
                    91, 95, 103, 104, 108, 110, 111, 114, 115, 121, 124, 125,
                    128, 131, 133, 134, 135, 136, 137, 145, 147, 149, 151, 160,
                    161, 164, 166, 170, 172, 173, 174, 176, 180, 181, 183, 185,
                    189, 193, 195, 200, 205, 207, 212, 213))
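#
# Sanity check (added): the fixed training and test indices should partition
# 1:214 with no overlap.
length(intersect(tt.2$train, tt.2$test))   # expect 0
length(c(tt.2$train, tt.2$test))           # expect 214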
oldpar <- par(mfrow = c(2, 1), mar = c(4, 4, 2, 1))
hist(match(fgl[tt.1$train, 10], levels(fgl[tt.1$train, 10])),
     breaks = c(0, 1, 2, 3, 4, 5, 6), main = "fgl (Training-1)", xlab = "classes")
hist(match(fgl[tt.2$train, 10], levels(fgl[tt.2$train, 10])),
     breaks = c(0, 1, 2, 3, 4, 5, 6), main = "fgl (Training-2)", xlab = "classes")
par(oldpar)
#
fgl.tr.2 <- tree(type ~ ., fgl[tt.2$train, ])
summary(fgl.tr.2)
confusion.expand(predict(fgl.tr.2, fgl[tt.2$train, ], type = "class"),
                 fgl[tt.2$train, 10])
#
confusion.expand(predict(fgl.tr.2, fgl[tt.2$test, ], type = "class"),
                 fgl[tt.2$test, 10])
#
oldpar <- par(mfrow = c(2, 1), mar = c(1, 1, 1, 1))
plot.tree.EJN(fgl.tr.1, type = "uniform")
text.tree.EJN(fgl.tr.1, all = T, cex = 0.75)
#
plot.tree.EJN(fgl.tr.2, type = "uniform")
text.tree.EJN(fgl.tr.2, all = T, cex = 0.75)
par(oldpar)

# GINI in tree
fgl.tr.g <- tree(type ~ ., fgl, split = "gini")
fgl.tr.g
plot(fgl.tr.g, type = "uniform")
text(fgl.tr.g, all = T, cex = 0.75)
library(tree)
(fgl.ntr.g <- tree(type ~ ., fgl, split = "gini"))
plot(fgl.ntr.g, type = "uniform")
text(fgl.ntr.g, all = T)

######################
require(rpart)
(fgl.rp <- rpart(type ~ ., fgl))
plot(fgl.rp, uniform = T, margin = 0.05)
text(fgl.rp, all = T, use.n = T)

# Bagging
# We split into training/test samples
N <- 194
S <- sort(sample(1:214, N))
SC <- sort((1:214) %w/o% S)
train <- fgl[S, ]
test <- fgl[SC, ]
training.sample.size <- length(S)
test.sample.size <- length(SC)
# We create 50 new training sets by sampling the
# original training set with replacement
sz <- 50
replicates <- matrix(0, sz, N)
for (i in 1:sz) {
  replicates[i, ] <- sort(sample(N, N, replace = T))
}
# Now, create trees for each of these 50 training sets
rep.trees <- {}
for (i in 1:sz) {
  rep.trees[[i]] <- rpart(type ~ ., data = train[replicates[i, ], ])
}
# See the comments in the VQ (vector quantization) section
best.match <- function(table, candidates) {
  tmp <- {}
  for (i in candidates) {
    # Count the number that match the current candidate
    tmp <- c(tmp, length(which(table == i)))
  }
  candidates[which(tmp == max(tmp))[1]]
}
# "Drop" the test cases through the trees and record the classification
# into a matrix - sz (50) votes for each case in the row
pred.tree <- matrix(0, length(SC), sz)
for (i in 1:sz) {
  pred.tree[, i] <- max.col(predict(rep.trees[[i]], test))
}
pred.tree[1, ]
pred.tree[2, ]
# Count the votes for each case
cbind(SC, apply(pred.tree, 1, best.match, 1:6))

# Do it 100 times
N <- 194
sz <- 50
bag <- matrix(0, dim(fgl)[1], 100)
for (j in 1:100) {
  # We split into training/test samples
  S <- sort(sample(1:214, N))
  SC <- sort((1:214) %w/o% S)
  train <- fgl[S, ]
  test <- fgl[SC, ]
  training.sample.size <- length(S)
  test.sample.size <- length(SC)
  # We create 50 new training sets by sampling the
  # original training set with replacement
  replicates <- matrix(0, sz, N)
  for (i in 1:sz) {
    replicates[i, ] <- sort(sample(N, N, replace = T))
  }
  # Now, create trees for each of these 50 training sets
  rep.trees <- {}
  for (i in 1:sz) {
    rep.trees[[i]] <- rpart(type ~ ., data = train[replicates[i, ], ])
  }
  # "Drop" the test cases through the trees and record the classification
  # into a matrix - sz (50) votes for each case in the row
  pred.tree <- matrix(0, length(SC), sz)
  for (i in 1:sz) {
    pred.tree[, i] <- max.col(predict(rep.trees[[i]], test))
  }
  # Now record the votes for the current test cases in a matrix,
  # with each case being put into the corresponding row
  # and the column being the current train/test set
  bag[SC, j] <- apply(pred.tree, 1, best.match, 1:6)
}
bag.class <- apply(bag, 1, best.match, 1:6)
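#
# Note (added): a row of "bag" stays all zeros if that case was never drawn
# into a test set over the 100 splits; best.match would then fall back to
# class 1. Check whether this actually happens before reading the confusion
# matrix below.
sum(apply(bag, 1, function(r) all(r == 0)))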
confusion.expand(levels(fgl[,10])[bag.class], fgl[,10])
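#
# (Added) Overall misclassification rate of the bagged classifier.
mean(levels(fgl[, 10])[bag.class] != as.character(fgl[, 10]))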